/*******************************************************************************
*Project :       	Track customer spending using firm level spending data 

*Purpose : 		    Prepare Datasets for analysis 

*Source : 			Second Measure Web Platform https://secondmeasure.com/

*******************************************************************************/

clear all 
set more off 
capture log close 

* generate global date
global date=subinstr(c(current_date), " ", "_", .)
set obs 1 
gen date="${date}"
split date, parse("_")
cap replace date=date3+"_"+date2+"_"+date4 if date1==""
cap replace date=date2+"_"+date1+"_"+date3 if date1!=""
global date=date[1]
display as error "Today's date = ${date}"
 

*===============================================================================
* NAICS Codes Data 
*===============================================================================

* load data 
import delimited using "data/raw/second_measure/companies.csv" , clear varnames(1) 

* clean some variables 
  replace company  = lower(company) 
  
* get 6-digit NAICS code 
  gen naics6 = substr(naicscode,1,6) 

* keep only companies with non-missing NAICS code 
  unique(company) if !missing(naics6) 
  keep if !missing(naics6) 
  
  keep company naics6 
  ren company name 
  
* save dataset 
  compress 
  save "data/clean/second_measure/firm_naics6_crosswalk.dta" , replace 
  
  
*===============================================================================
* Firm Level Data : Firm X Sales/Transactions X State X Day Data 
*===============================================================================

* load data 
import delimited using "data/raw/second_measure/trends_by_geo_day.csv" , ///
	   clear varnames(1) 
	   
* preview data 
  describe 
  mdesc 
  
* save a temp data as the csv takes forever to load 
  drop observed_transactions sales_per_customer avg_txn_value txns_per_customer 
	   
  compress
  save "data/raw/second_measure/trends_by_geo_day.dta" , replace 
  

* load temp file 
  u "data/raw/second_measure/trends_by_geo_day.dta" , clear 
 
* get date variables from rollup_start_date variable 
  gen date = date(rollup_start_date, "YMD") 
  format date %td
   
* keep only a subset of data to help with cleaning 
  g year = yofd(date)
  keep if year>=2019 
  unique(geographic_location) 
  count if missing(geographic_location) 
  keep if !missing(geographic_location) 

* convert strings to lower case
  foreach var in company name   { 
	 replace `var' = lower(`var') 
	} 
	
* label variables : from Data Feeds Info Pdf in dataRAW folder 
  lab var company "Company ID" 
  lab var ticker  "US Exchange Ticker" 

* get naics codes 
  replace name = "hebparent" if name == "h-e-b" & company =="heb" // correct a name for merging 
  replace name = "metropcs"  if name == "metro by t-mobile" & company == "metropcs" 
  merge m:1 name using "data/clean/second_measure/firm_naics6_crosswalk.dta" 
  list if _merge ==2 
  keep if _merge !=2  
  drop _merge 
  
* rename var 
  rename geographic_location state
  rename observed_sales sales 
 
* keep necessary variables 
  keep company name date naics6 sales year  state observed_customers 

* save a firm level dataset 
  compress 
  save "data/clean/second_measure/firms_geo_daily_sales.dta" , replace 
  
*===============================================================================
* Sector Level Data : Sector X Sales/Transactions X State X Day Data 
*===============================================================================

* load firm level data 
u "data/clean/second_measure/firms_geo_daily_sales.dta" , clear 
keep if !missing(naics6) 

* extract 2-digit NAICS code 
  gen naics2 = substr(naics6,1,2)
  destring naics2 , replace 
  tab naics2 , m 
  
* generate category labels for NAICS2
* source : https://www.census.gov/programs-surveys/economic-census/guidance/understanding-naics.html
  gen sector = "" 
  replace sector = "agriculture, forestry, fishing and hunting"  	if naics2 == 11 
  replace sector = "mining, quarrying, and oil and gas extraction"  if naics2 == 21 
  replace sector = "utilities"										if naics2 == 22 
  replace sector = "construction" 									if naics2 == 23 
  replace sector = "manufacturing"									if inrange(naics2,31,33)
  replace sector = "wholesale trade"								if naics2 == 42 
  replace sector = "retail trade"									if naics2 == 44 | naics2 == 45 
  replace sector = "transportation"									if naics2 == 48 
  replace sector = "food delivery"			    					if naics2 == 49 
  replace sector = "information"									if naics2 == 51 
  replace sector = "finance & insurance" 							if naics2 == 52 
  replace sector = "real estate, rental & leasing"					if naics2 == 53 
  replace sector = "professional, scientific & technical services"  if naics2 == 54 
  replace sector = "management of companies & enterprises" 			if naics2 == 55 
  replace sector = "hotel booking" 									if naics2 == 56 
  replace sector = "educational services" 							if naics2 == 61
  replace sector = "health care & social assistance" 				if naics2 == 62 
  replace sector = "arts, entertainment & recreation"				if naics2 == 71 
  replace sector = "accommodation & food services"					if naics2 == 72 
  replace sector = "other services (except public administration)"	if naics2 == 81 
  replace sector = "public administration" 							if naics2 == 92 
  
  
* save naics2 sector codes as strings for graph names 
  tostring naics2, replace 
  replace naics2 = "31-33" if naics2 == "31" | naics2 == "33" | naics2 == "32"
  replace naics2 = "44-45" if naics2 == "44" | naics2 == "45" 
  
* aggregate sales to sector level 
  bys naics2 state date : egen sales_sector = total(sales)
  bys naics2 state date : egen customers_sector = total(observed_customers) 
   
* keep one observation per sector X date 
  bys naics2 state date : keep if _n == 1 
  
* keep necessary variables 
  keep naics2 sector date *_sector state date year 
  
* label variables 
  lab var naics2   			    "NAICS 2 digit code" 
  lab var sector   			    "Sector" 
  lab var sales_sector 			"Sector level daily sales"
  lab var customers_sector      "Sector Level Customers" 
  
 * save dataset 
  compress 
  save "data/clean/second_measure/sector_geo_daily_sales.dta" , replace 
  
*===============================================================================
* Add New Sectors 
*===============================================================================

* add new sectors 
  u "data/clean/second_measure/firms_geo_daily_sales.dta" , clear 
  
  generate sector = "pharmacies" 	            if inlist(company, "cvs" , "wba" , "rad" , "pharmaca")
  replace  sector = "grocery stores"	        if inlist(company, "wfm" , "publix" , "kroger" , "traderjoes" , ///
																  "heb" , "tomthumb" , "sprouts" , "aldi" )
  replace  sector  = "general/wholesale retail" if inlist(company, "amazonbrand" , "walmart" , "cost" , ///
															        "targetbrand" , "bjswholesale" , ///
															        "samsclub") 
  replace sector  = "amazon" 				    if inlist(company, "amazonbrand") 
  
  replace sector = "restaurants" 				if inlist(naics6, "722511" , "722513" , "722514" ) 
  
* keep these sectors only 
  keep if !missing(sector) 
  
* aggregate sales to sector level 
  bys sector state date : egen sales_sector = total(sales)
  bys sector state date : egen customers_sector = total(observed_customers) 
   
* keep one observation per sector X date 
  bys sector state date : keep if _n == 1 
  
* keep necessary variables 
  keep  sector date *_sector state date year 
  

* append to sector data 
  append using "data/clean/second_measure/sector_geo_daily_sales.dta"
  
* add state fips 
  ren state state_usps 
  merge m:1 state_usps using "data/raw/second_measure/statefips_stateabbrev_xwalk.dta" , assert(2 3) ///
			keep(3) nogen 
  destring state_fips , replace 
  format state_fips %02.0f 
  drop state_name  
  rename state_usps state 
  
* add normalization factor 
  preserve 
   import delimited using "data/raw/second_measure/Second_Measure_geographic_representativeness.csv." , ///
		clear varnames(1) 
  ren normalization_factor nm_factor 
  tempfile nm_factor 
  save `nm_factor' , replace 
  restore 
  
  merge m:1 state using `nm_factor' , assert(2 3) keep(3) nogen 

  
  compress 
  save "data/clean/second_measure/sector_geo_daily_sales.dta" , replace 
  
*===============================================================================
* Add Overall category to sector_geo_daily_sales 
*===============================================================================

* load data 
u "data/clean/second_measure/sector_geo_daily_sales.dta" , clear 

* drop created sectors
drop if inlist(sector , "amazon" , "grocery stores" , "general/wholesale retail" , "pharmacies" , "restaurants") 

* collapse to daily revenue 
collapse (sum) sales_sector customers_sector (first) state_fips year nm_factor  , by(state date) 
g sector = "overall" 


append using "data/clean/second_measure/sector_geo_daily_sales.dta" 

save "data/clean/second_measure/sector_geo_daily_sales.dta" , replace 

*===============================================================================
* Add New Sectors - Retail , Non - Retail , Amazon , Food - Delivery 
*===============================================================================

* load data 
  u "data/clean/second_measure/firms_geo_daily_sales.dta" , clear 
  keep if !missing(naics6) 

  
* extract 2-digit NAICS code 
  gen naics2 = substr(naics6,1,2)
  destring naics2 , replace 
  tab naics2 , m 

* get new sectors : retail and non - retail 
  drop if naics2 == 49 //eliminate food delivery 
  drop if company == "amazonbrand"  // eliminate amazon 
  g sector = "retail" if (naics2 == 44 | naics2 == 45) 
  replace sector = "non-retail" if missing(sector) 
  tab sector 
  
* aggregate sales to sector level 
  bys sector state date : egen sales_sector = total(sales)
  bys sector state date : egen customers_sector = total(observed_customers) 
   
* keep one observation per sector X date 
  bys sector state date : keep if _n == 1 
  
 * add state fips 
  ren state state_usps 
  merge m:1 state_usps using "data/raw/second_measure/statefips_stateabbrev_xwalk.dta" , assert(2 3) ///
			keep(3) nogen 
  destring state_fips , replace 
  format state_fips %02.0f 
  drop state_name  
  rename state_usps state 
  
* add normalization factor 
  preserve 
   import delimited using "data/raw/second_measure/Second_Measure_geographic_representativeness.csv." , ///
		clear varnames(1) 
  ren normalization_factor nm_factor 
  tempfile nm_factor 
  save `nm_factor' , replace 
  restore 
  
  merge m:1 state using `nm_factor' , assert(2 3) keep(3) nogen 


* keep necessary variables 
  keep  sector date *_sector state date year state_fips 

* append to sector data 
  append using "data/clean/second_measure/sector_geo_daily_sales.dta"

compress 
save "data/clean/second_measure/sector_geo_daily_sales.dta" , replace 
						
  
  
*===============================================================================
* Firm Level Data : Firm X Channel X Sales/Transactions X Day Data 
*===============================================================================

* load data 
import delimited using "data/raw/second_measure/daily_brand_channel_sales_transactions_5_day_lag.csv" , ///
	   clear varnames(1) 
	   
* preview data 
  describe 
  mdesc 

* get date variables from rollup_start_date variable 
  gen date2 = date(date, "YMD") 
  format date2 %td
  rename date rollup_start_date 
  rename date2 date 
  drop rollup_*
  
* convert strings to lower case 
  foreach var in company brand name channel { 
	 replace `var' = lower(`var') 
	} 
	
* label variables : from Data Feeds Info Pdf in dataRAW folder 
  lab var company "Company ID" 
  lab var brand   "Brand ID : subsidiary of company" 
  lab var name    "Company legal name" 
  lab var ticker  "US Exchange Ticker" 

* get naics codes 
  replace name = "hebparent" if name == "h-e-b" & company =="heb" // correct a name for merging 
  replace name = "metropcs"  if name == "metro by t-mobile" & company == "metropcs" 
  merge m:1 name using "data/clean/second_measure/firm_naics6_crosswalk.dta" , keep(1 3) nogen 
 
* replace channel values 
  replace channel = "online" if channel == "online/catalog" 
  
* aggregate sales/transactions to company x channel date level 
  bys company channel date : egen total_sales  = total(sales) 
  
* keep one observation per company X date 
  bys company channel date : keep if _n==1 
   
* keep only a subset of the date
  g year = yofd(date) 
  keep if year>=2019 
  
* keep necessary variables 
  keep company name date total_sales channel date  naics6 
	   
 * save dataset 
  compress 
  save "data/clean/second_measure/firm_channel_daily_sales.dta" , replace 
  
*===============================================================================
* Sector Level Data : Sector X Channel X Sales/Transactions X Day Data 
*===============================================================================

* load firm level data 
u "data/clean/second_measure/firm_channel_daily_sales.dta" , clear 
keep if !missing(naics6) 

* extract 2-digit NAICS code 
  gen naics2 = substr(naics6,1,2)
  destring naics2 , replace 
  tab naics2 , m 
  
* generate category labels for NAICS2
* source : https://www.census.gov/programs-surveys/economic-census/guidance/understanding-naics.html
  gen sector = "" 
  replace sector = "agriculture, forestry, fishing and hunting"  	if naics2 == 11 
  replace sector = "mining, quarrying, and oil and gas extraction"  if naics2 == 21 
  replace sector = "utilities"										if naics2 == 22 
  replace sector = "construction" 									if naics2 == 23 
  replace sector = "manufacturing"									if inrange(naics2,31,33)
  replace sector = "wholesale trade"								if naics2 == 42 
  replace sector = "retail trade"									if naics2 == 44 | naics2 == 45 
  replace sector = "transportation"									if naics2 == 48 
  replace sector = "food delivery"			    					if naics2 == 49 
  replace sector = "information"									if naics2 == 51 
  replace sector = "finance & insurance" 							if naics2 == 52 
  replace sector = "real estate, rental & leasing"					if naics2 == 53 
  replace sector = "professional, scientific & technical services"  if naics2 == 54 
  replace sector = "management of companies & enterprises" 			if naics2 == 55 
  replace sector = "hotel booking" 									if naics2 == 56 
  replace sector = "educational services" 							if naics2 == 61
  replace sector = "health care & social assistance" 				if naics2 == 62 
  replace sector = "arts, entertainment & recreation"				if naics2 == 71 
  replace sector = "accommodation & food services"					if naics2 == 72 
  replace sector = "other services (except public administration)"	if naics2 == 81 
  replace sector = "public administration" 							if naics2 == 92 
  
  
* save naics2 sector codes as strings for graph names 
  tostring naics2, replace 
  replace naics2 = "31-33" if naics2 == "31" | naics2 == "33" | naics2 == "32"
  replace naics2 = "44-45" if naics2 == "44" | naics2 == "45" 
  
 
 * save dataset 
  compress 
  save "data/clean/second_measure/sector_channel_daily_sales.dta" , replace 
  
*===============================================================================
* Add New Sectors 
*===============================================================================

* add new sectors 
  u "data/clean/second_measure/firm_channel_daily_sales.dta" , clear 
  
  generate sector = "pharmacies" 	            if inlist(company, "cvs" , "wba" , "rad" , "pharmaca")
  replace  sector = "grocery stores"	        if inlist(company, "wfm" , "publix" , "kroger" , "traderjoes" , ///
																  "heb" , "tomthumb" , "sprouts" , "aldi" )
  replace  sector  = "general/wholesale retail" if inlist(company, "amazonbrand" , "walmart" , "cost" , ///
															        "targetbrand" , "bjswholesale" , ///
															        "samsclub") 
  replace sector  = "amazon" 				    if inlist(company, "amazonbrand") 
  replace sector = "restaurants" 				if inlist(naics6, "722511" , "722513" , "722514" ) 
 
* keep these sectors only 
  keep if !missing(sector) 
  
* merge on date channel company 
  append using "data/clean/second_measure/sector_channel_daily_sales.dta"
  
* aggregate sales to  sector level 
  bys  sector date : egen sales = total(total_sales) 
  bys  sector date : keep if _n ==1 
  drop channel naics2 naics6 total_sales name company 
  
* save dataset 
  compress 
  save "data/clean/second_measure/sector_channel_daily_sales.dta" , replace 


